{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PhraseId      156060\n",
      "SentenceId    156060\n",
      "Phrase        156060\n",
      "Sentiment     156060\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_csv('./train.tsv', header=0, delimiter='\\t')\n",
    "print(df.count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   PhraseId  SentenceId                                             Phrase  \\\n",
      "0         1           1  A series of escapades demonstrating the adage ...   \n",
      "1         2           1  A series of escapades demonstrating the adage ...   \n",
      "2         3           1                                           A series   \n",
      "3         4           1                                                  A   \n",
      "4         5           1                                             series   \n",
      "\n",
      "   Sentiment  \n",
      "0          1  \n",
      "1          2  \n",
      "2          2  \n",
      "3          2  \n",
      "4          2  \n"
     ]
    }
   ],
   "source": [
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    A series of escapades demonstrating the adage ...\n",
      "1    A series of escapades demonstrating the adage ...\n",
      "2                                             A series\n",
      "3                                                    A\n",
      "4                                               series\n",
      "5    of escapades demonstrating the adage that what...\n",
      "6                                                   of\n",
      "7    escapades demonstrating the adage that what is...\n",
      "8                                            escapades\n",
      "9    demonstrating the adage that what is good for ...\n",
      "Name: Phrase, dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(df['Phrase'].head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count    156060.000000\n",
      "mean          2.063578\n",
      "std           0.893832\n",
      "min           0.000000\n",
      "25%           2.000000\n",
      "50%           2.000000\n",
      "75%           3.000000\n",
      "max           4.000000\n",
      "Name: Sentiment, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "print(df['Sentiment'].describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2    79582\n",
      "3    32927\n",
      "1    27273\n",
      "4     9206\n",
      "0     7072\n",
      "Name: Sentiment, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(df['Sentiment'].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2    0.509945\n",
      "3    0.210989\n",
      "1    0.174760\n",
      "4    0.058990\n",
      "0    0.045316\n",
      "Name: Sentiment, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "print(df['Sentiment'].value_counts()/df['Sentiment'].count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 3 folds for each of 24 candidates, totalling 72 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min\n",
      "[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  3.3min finished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best score: 0.617\n",
      "Best parameters set:\n",
      "tclf__C: 10\n",
      "tvect__max_df: 0.25\n",
      "tvect__ngram_range: (1, 2)\n",
      "tvect__use_idf: False\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model.logistic import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "\n",
    "df = pd.read_csv('./train.tsv', header=0, delimiter='\\t')\n",
    "X, y = df['Phrase'], df['Sentiment'].as_matrix()\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)\n",
    "grid_search = main(X_train, y_train)\n",
    "pipeline = Pipeline([\n",
    "    ('vect', TfidfVectorizer(stop_words='english')),\n",
    "    ('clf', LogisticRegression())\n",
    "])\n",
    "parameters = {\n",
    "    'vect__max_df': (0.25, 0.5),\n",
    "    'vect__ngram_range': ((1, 1), (1, 2)),\n",
    "    'vect__use_idf': (True, False),\n",
    "    'clf__C': (0.1, 1, 10),\n",
    "}\n",
    "grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')\n",
    "grid_search.fit(X_train, y_train)\n",
    "print('Best score: %0.3f' % grid_search.best_score_)\n",
    "print('Best parameters set:')\n",
    "best_parameters = grid_search.best_estimator_.get_params()\n",
    "for param_name in sorted(parameters.keys()):\n",
    "    print('t%s: %r' % (param_name, best_parameters[param_name]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.636255286428\n",
      "Confusion Matrix:\n",
      "[[ 1124  1725   628    65    10]\n",
      " [  923  6049  6132   583    34]\n",
      " [  197  3131 32658  3640   137]\n",
      " [   15   398  6530  8234  1301]\n",
      " [    3    43   530  2358  1582]]\n",
      "Classification Report:\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.50      0.32      0.39      3552\n",
      "          1       0.53      0.44      0.48     13721\n",
      "          2       0.70      0.82      0.76     39763\n",
      "          3       0.55      0.50      0.53     16478\n",
      "          4       0.52      0.35      0.42      4516\n",
      "\n",
      "avg / total       0.62      0.64      0.62     78030\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictions = grid_search.predict(X_test)\n",
    "\n",
    "print('Accuracy: %s' % accuracy_score(y_test, predictions))\n",
    "print('Confusion Matrix:')\n",
    "print(confusion_matrix(y_test, predictions))\n",
    "print('Classification Report:')\n",
    "print(classification_report(y_test, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
